Cargando librerias
library(viridis)
## Warning: package 'viridis' was built under R version 4.2.3
## Loading required package: viridisLite
## Warning: package 'viridisLite' was built under R version 4.2.3
library(Biostrings)
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
library(DECIPHER)
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 4.2.3
## Loading required package: parallel
library(ade4)
## Warning: package 'ade4' was built under R version 4.2.3
##
## Attaching package: 'ade4'
## The following object is masked from 'package:Biostrings':
##
## score
## The following object is masked from 'package:BiocGenerics':
##
## score
library(seqinr)
## Warning: package 'seqinr' was built under R version 4.2.3
##
## Attaching package: 'seqinr'
## The following object is masked from 'package:Biostrings':
##
## translate
library(adegenet)
## Warning: package 'adegenet' was built under R version 4.2.3
##
## /// adegenet 2.1.10 is loaded ////////////
##
## > overview: '?adegenet'
## > tutorials/doc/questions: 'adegenetWeb()'
## > bug reports/feature requests: adegenetIssues()
library(ape)
## Warning: package 'ape' was built under R version 4.2.3
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
## The following object is masked from 'package:Biostrings':
##
## complement
library(ggtree)
## ggtree v3.6.2 For help: https://yulab-smu.top/treedata-book/
##
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
##
## Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam.
## ggtree: an R package for visualization and annotation of phylogenetic
## trees with their covariates and other associated data. Methods in
## Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
##
## Guangchuang Yu. Using ggtree to visualize data on tree-like structures.
## Current Protocols in Bioinformatics. 2020, 69:e96. doi:10.1002/cpbi.96
##
## Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods
## for mapping and visualizing associated data on phylogeny using ggtree.
## Molecular Biology and Evolution. 2018, 35(12):3041-3043.
## doi:10.1093/molbev/msy194
##
## Attaching package: 'ggtree'
## The following object is masked from 'package:ape':
##
## rotate
## The following object is masked from 'package:Biostrings':
##
## collapse
## The following object is masked from 'package:IRanges':
##
## collapse
## The following object is masked from 'package:S4Vectors':
##
## expand
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
Guardando las variantes en variables
# NC_045512.2 Covid 19
# NC_004718.3 sarscov
# OX008586 Covid beta
# OW998408 covid alfa
# OX014251 covid gamma
# MZ937000 Covid murcielago
# KP849472 Covid perro
cov2 = read.GenBank("NC_045512.2")
sarsCov = read.GenBank("NC_004718.3")
covBeta = read.GenBank("OX008586.1")
covAlpha = read.GenBank("OW998408")
covGamma = read.GenBank("OX014251")
covOmicron = read.GenBank("OW996240")
covMurcielago = read.GenBank("MZ937000")
covCanino = read.GenBank("KP849472")
Guardando todas las secuencias en un archivo fasta
secuencias <- c(cov2, sarsCov, covBeta, covAlpha, covGamma, covOmicron, covMurcielago, covCanino)
write.dna(secuencias, file = "todas_secuencias.fasta", format = "fasta")
secuencias_adn <- readDNAStringSet("todas_secuencias.fasta", format = "fasta")
## Warning in .Call2("fasta_index", filexp_list, nrec, skip, seek.first.rec, :
## reading FASTA file todas_secuencias.fasta: ignored 19917 invalid one-letter
## sequence codes
secuencias_adn
## DNAStringSet object of length 8:
## width seq names
## [1] 29903 ATTAAAGGTTTATACCTTCCCAG...AAAAAAAAAAAAAAAAAAAAAAA NC_045512.2
## [2] 29751 ATATTAGGTTTTTACCTACCCAG...AAAAAAAAAAAAAAAAAAAAAAA NC_004718.3
## [3] 29879 NNNNNNNNNNNNNNNNNNNNNNN...NNNNNNNNNNNNNNNNNNNNNNN OX008586.1
## [4] 29884 NNNNNNNNNNNNNNNNNNNNNNN...NNNNNNNNNNNNNNNNNNNNNNN OW998408
## [5] 29890 NNNNNNNNNNNNNNNNNNNNNNN...NNNNNNNNNNNNNNNNNNNNNNN OX014251
## [6] 29866 NNNNNNNNNNNNNNNNNNNNNNN...NNNNNNNNNNNNNNNNNNNNNNN OW996240
## [7] 29838 TAACAAACCAACCAACTTTCGAT...AAAAAAAAAAAAAAAAAAAAAAA MZ937000
## [8] 30004 ACTTTTAAAGTAAAAGTGAGTGT...CCTTTTGATAGTGATACACAAAA KP849472
Grafico con las longuitudes
longitudes <- width(secuencias_adn)
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:XVector':
##
## slice
## The following object is masked from 'package:IRanges':
##
## slice
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
datos <- data.frame(
x = c("Covid 19", "sarsCov", "covBeta", "covAlpha","covGamma", "covOmicron", "covMurcielago", "Alphacoronavirus_1"),
y = longitudes)
p <- ggplot(datos, aes(x = x, y = y))+
geom_bar(stat = "identity") +
scale_y_continuous(breaks = seq(29751, 30020, by = 20))+
labs(
title = "Longuitud secuencias",
x = "ADN",
y = "Numero nucleotidos"
)
ggplotly(p)
Calculando la composicion de nucleotidos:
composicion_nucleotidos <- alphabetFrequency(secuencias_adn, baseOnly = TRUE)
composicion_nucleotidos
## A C G T other
## [1,] 8954 5492 5863 9594 0
## [2,] 8481 5940 6187 9143 0
## [3,] 8678 5324 5701 9319 857
## [4,] 8890 5462 5844 9567 121
## [5,] 8790 5393 5778 9442 487
## [6,] 8548 5256 5615 9255 1192
## [7,] 8938 5492 5837 9571 0
## [8,] 9083 5114 6015 9792 0
Graficando la composicion de nucleotidos
barplot(composicion_nucleotidos, main = "ComposiciĂ³n de nucleĂ³tidos", xlab = "NucleĂ³tidos", ylab = "Frecuencia")
Calculando el porcentaje de CG de cada variante
cantidad_cg_todos = c()
porcenteajes_cg_todos = c()
nombre = c("Covid 19", "SARS-COV", "Betacoronavirus", "Alphacoronavirus","Gammacoronavirus", "Omicron", "Banal-52", "covCanino")
for (i in 1:8){
porcentaje = (composicion_nucleotidos[i,2] + composicion_nucleotidos[i,3]) * 100 / (composicion_nucleotidos[i,1]+composicion_nucleotidos[i,2] + composicion_nucleotidos[i,3] + composicion_nucleotidos[i,4])
cantidad_cg = composicion_nucleotidos[i,2] + composicion_nucleotidos[i,3]
cantidad_cg_todos = append(cantidad_cg_todos, cantidad_cg)
porcenteajes_cg_todos = append(porcenteajes_cg_todos, porcentaje)
cat("El porcentaje de CG del virus", nombre[i]," es:", porcentaje, "%\n")}
## El porcentaje de CG del virus Covid 19 es: 37.97278 %
## El porcentaje de CG del virus SARS-COV es: 40.76166 %
## El porcentaje de CG del virus Betacoronavirus es: 37.98842 %
## El porcentaje de CG del virus Alphacoronavirus es: 37.98676 %
## El porcentaje de CG del virus Gammacoronavirus es: 37.99272 %
## El porcentaje de CG del virus Omicron es: 37.91239 %
## El porcentaje de CG del virus Banal-52 es: 37.96836 %
## El porcentaje de CG del virus covCanino es: 37.09172 %
Generando una grafica que grafique el porcentaje de CG de cada virus
datos <- data.frame(
x = c("Covid 19", "sarsCov", "covBeta", "covAlpha","covGamma", "covOmicron", "covMurcielago", "covCanino"),
y = porcenteajes_cg_todos)
p <- ggplot(datos, aes(x = x, y = y))+
geom_bar(stat = "identity") +
scale_y_continuous(breaks = seq(0,45, by = 2))+
labs(
title = "Porcentaje CG por virus",
x = "Virus",
y = "Porcentaje"
)
ggplotly(p)
Cerando dataFrame con datos de los genomas
nombres = c("Covid 19", "sarsCov", "covBeta", "covAlpha","covGamma", "covOmicron", "covMurcielago", "Alphacoronavirus_1")
filas = c("ID", "CANTIDAD CG", "PORCENTAJE CG")
id = c("NC_045512.2","NC_004718.3","OX008586.1","OW998408","OX014251","OW996240","MZ937000","KP849472")
df <- data.frame(matrix(nrow = 3, ncol = 8))
colnames(df) <- nombres
rownames(df) <- filas
df[1,] <- id
df[2,] <- cantidad_cg_todos
df[3,] <- porcenteajes_cg_todos
# Print the data frame
cat("El marco de datos resultante es:\n")
## El marco de datos resultante es:
print(df)
## Covid 19 sarsCov covBeta
## ID NC_045512.2 NC_004718.3 OX008586.1
## CANTIDAD CG 11355 12127 11025
## PORCENTAJE CG 37.9727786509715 40.7616550704178 37.9884225759768
## covAlpha covGamma covOmicron
## ID OW998408 OX014251 OW996240
## CANTIDAD CG 11306 11171 10871
## PORCENTAJE CG 37.9867620871552 37.9927218311057 37.9123945037316
## covMurcielago Alphacoronavirus_1
## ID MZ937000 KP849472
## CANTIDAD CG 11329 11129
## PORCENTAJE CG 37.9683624907836 37.0917211038528
Alineando las secuencias
virus_seq_align <- AlignSeqs(secuencias_adn)
## Determining distance matrix based on shared 11-mers:
## ================================================================================
##
## Time difference of 0.67 secs
##
## Clustering into groups by similarity:
## ================================================================================
##
## Time difference of 0.01 secs
##
## Aligning Sequences:
## ================================================================================
##
## Time difference of 34.72 secs
##
## Iteration 1 of 2:
##
## Determining distance matrix based on alignment:
## ================================================================================
##
## Time difference of 0.01 secs
##
## Reclustering into groups by similarity:
## ================================================================================
##
## Time difference of 0 secs
##
## Realigning Sequences:
## ================================================================================
##
## Time difference of 31.68 secs
##
## Iteration 2 of 2:
##
## Determining distance matrix based on alignment:
## ================================================================================
##
## Time difference of 0.01 secs
##
## Reclustering into groups by similarity:
## ================================================================================
##
## Time difference of 0 secs
##
## Realigning Sequences:
## ================================================================================
##
## Time difference of 35.41 secs
BrowseSeqs(virus_seq_align)
Creando la matriz distancia
writeXStringSet(virus_seq_align, file="coronavirus_seq_align.fasta")
virus_aligned = read.alignment("coronavirus_seq_align.fasta", format = "fasta")
matriz_distancia = dist.alignment(virus_aligned, matrix = "similarity")
matriz_distancia = as.data.frame(as.matrix(matriz_distancia))
matriz_distancia
Creando una tabla a escala de grices d ela matriz distancia
table.paint(matriz_distancia, cleg = 0, clabel.row = 0.5, clabel.col = 0.5) + scale_color_viridis()
## NULL
Creando un arbol filogenetico
writeXStringSet(virus_seq_align, file="coronavirus_seq_align.fasta")
virus_aligned = read.alignment("coronavirus_seq_align.fasta", format = "fasta")
matriz_distancia = dist.alignment(virus_aligned, matrix = "similarity")
virus_tree <- nj(matriz_distancia)
virus_tree <- ladderize(virus_tree)
plot_virus_filogenia <- ggtree(virus_tree) + geom_tiplab() + ggtitle("Phylogenetic analysis of SARS-CoV genomes")
plot_virus_filogenia